qPCR data QC

Author

Laura Symul & Laura Vermeren

Published

October 20, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)

tmp <- fs::dir_map("R scripts/", source)
tmp <- fs::dir_map("../VIBRANT-99-utils/R/", source)
rm(tmp)

theme_set(theme_light())

The purpose of this document is to perform various quality check on the qPCR data generated on “weekly swabs” from the VIBRANT clinical study. “Weekly swabs” are swab samples collected at their weekly in-clinic visits.

Loading the data

Code
qpcr_dir <- str_c(get_data_dir(), "03 qPCR/")
file <- "VIBRANT_qPCR_data_merged_250611.csv"
cat("We load the file", file, " (15 LBP strains)\n\n")
We load the file VIBRANT_qPCR_data_merged_250611.csv  (15 LBP strains)
Code
qpcr_LBP <- read_csv(str_c(qpcr_dir, file))

file <- "VIBRANT_16SqPCR_250616.csv"
cat("\n\nAnd the file ", file, " (16S)\n\n")


And the file  VIBRANT_16SqPCR_250616.csv  (16S)
Code
qpcr_16S <- read_csv(str_c(qpcr_dir, file))

rm(qpcr_dir, file)

The data is provided in long format:

Code
qpcr_LBP |> glimpse()
Rows: 54,810
Columns: 21
$ Data_row                 <dbl> 2275, 2276, 2277, 1603, 1604, 1605, 2272, 227…
$ Well                     <chr> "E14", "E15", "E16", "E14", "E15", "E16", "E1…
$ Fluor                    <chr> "HEX", "HEX", "HEX", "Cy5", "Cy5", "Cy5", "FA…
$ Content                  <chr> "Unkn-23", "Unkn-23", "Unkn-23", "Unkn-23", "…
$ Replicate                <dbl> 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 2…
$ Sample                   <chr> "C5", "C5", "C5", "C5", "C5", "C5", "C5", "C5…
$ Cq                       <dbl> 30.91797, 30.97083, 30.74273, NA, NA, NA, NA,…
$ `Starting Quantity (SQ)` <dbl> 97.7864496, 94.2051474, 110.6643855, NA, NA, …
$ `Cq Mean`                <dbl> 30.87718, 30.87718, 30.87718, 0.00000, 0.0000…
$ plate_id                 <chr> "B063809", "B063809", "B063809", "B063810", "…
$ Plate_number             <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1…
$ gDNA_Plate_ID            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VMRC_Group               <chr> "VMRC_4", "VMRC_4", "VMRC_4", "VMRC_3", "VMRC…
$ Group_Fluor              <chr> "VMRC_4_HEX", "VMRC_4_HEX", "VMRC_4_HEX", "VM…
$ Target                   <chr> "122010", "122010", "122010", "185329", "1853…
$ gDNA_Plate_Well          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VIBR_Sample_ID           <chr> "551", "551", "551", "551", "551", "551", "55…
$ InRedos                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ Dilution_Factor          <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
$ Quant_Adjusted           <dbl> 488.932248, 471.025737, 553.321928, 0.000000,…
$ Copies_per_swab          <dbl> 122233.0620, 117756.4343, 138330.4819, 0.0000…

Merging 16S and LBP qPCR data

Since the two files have almost exactly the same columns, we merge them into a single file after some harmonization.

Note that there is one sample that has data for the LBP qPCR, but not for the 16S qPCR.

Code
dplyr::full_join(
  qpcr_16S |> select(VIBR_Sample_ID) |> mutate(has_16S = TRUE) |> distinct(),
  qpcr_LBP |> select(VIBR_Sample_ID) |> mutate(has_LBP = TRUE) |> distinct()
) |>  dplyr::filter(is.na(has_16S) | is.na(has_LBP)) |> 
  dplyr::filter(!is.na(VIBR_Sample_ID), !(VIBR_Sample_ID %in% c("#N/A", "empty"))) |> 
  mutate(index = row_number()) |> 
  select(index, everything()) |> 
  gt(
    caption = "Clinical sample with data for LBP qPCR but not for 16S qPCR, or vice versa."
  )
Clinical sample with data for LBP qPCR but not for 16S qPCR, or vice versa.
index VIBR_Sample_ID has_16S has_LBP
1 2171048 NA TRUE

We also note that there is one sample on the same plate that has 6 replicates instead of 3 in the 16S qPCR.

Code
qpcr_16S |> dplyr::filter(!is.na(VIBR_Sample_ID)) |> dplyr::count(VIBR_Sample_ID, gDNA_Plate_ID ) |> dplyr::filter(n > 3)
# A tibble: 1 × 3
  VIBR_Sample_ID gDNA_Plate_ID     n
  <chr>                  <dbl> <int>
1 2167612              2240890     6
Code
qpcr_16S |> dplyr::filter(VIBR_Sample_ID == "2167612") |> select(Data_row, Well, Sample, Cq, Copies_per_swab)
# A tibble: 6 × 5
  Data_row Well  Sample    Cq Copies_per_swab
     <dbl> <chr> <chr>  <dbl>           <dbl>
1      163 I08   E4      22.4      218362435.
2      178 J07   E4      22.4      213446847.
3      179 J08   E4      22.4      222995668.
4      242 M10   G5      22.6      181004723.
5      258 N09   G5      22.5      195241505.
6      259 N10   G5      22.8      157625548.

As discussed with Michael, this is a mislabeling and these 6 are 3 replicates for 2 different samples. Since we have identified this issue already, we will fix it later in the code, using the extraction library plate number and position.

Code
# We rename the standards in the 16S qPCR so they are numbered
std_16s <-
  qpcr_16S |> 
  dplyr::filter(str_detect(Content, "Std")) |> 
  select(Content, `Starting Quantity (SQ)`) |> 
  distinct() |> 
  arrange(`Starting Quantity (SQ)` |> desc()) |>
  mutate(std_nb = row_number())
  
qpcr_16S <- 
  qpcr_16S |> 
  dplyr::left_join(std_16s, by = join_by(Content, `Starting Quantity (SQ)`)) |> 
  mutate(
    Content = Content |> str_c("-", std_nb |> str_pad(width = 2, pad = "0")),
    Target = "16S"
        ) |> 
  select(-std_nb)

# We transform the VIBR_Sample_ID column to be consistent with the 16S qPCR
qpcr_LBP <- 
  qpcr_LBP |> 
  mutate(
    VIBR_Sample_ID = ifelse(VIBR_Sample_ID == "#N/A", NA_character_, VIBR_Sample_ID)
  )

# we merge the two datasets
qpcr <- 
  bind_rows(
    qpcr_LBP |> mutate(target_type = "LBP"), 
    qpcr_16S |> mutate(target_type = "16S rRNA")
    ) |> 
  relocate(target_type, .after = Target)
Code
dictionary <- 
  tibble(original_name = colnames(qpcr)) |> 
  mutate(
    description = 
      case_when(
        (original_name == "Data_row") ~ 'Table row number',
        (original_name == "Well") ~ str_c("qPCR plate well ID. From ", min(qpcr$Well), " to ", max(qpcr$Well)),
        (original_name == "Fluor") ~ str_c('fluorescent dye used (one of ', qpcr$Fluor |> unique() |> str_c(collapse = ", "), ")"),
        (original_name == "Content") ~ 'Indicates whether well contains a sample, a standard, or a negative control.',
        (original_name == "Replicate") ~ 
          str_c(
            "The replicate number from ", min(qpcr$Replicate, na.rm = TRUE), " to ", max(qpcr$Replicate, na.rm = TRUE), 
            " (with ", sum(is.na(qpcr$Replicate)), " missing values). Values are repeated 495, or 990 times."
          ),
        (original_name == "Sample") ~ 'A qPCR plate-specific sample ID. This column wont be used or kept for downstream analyses.',
        (original_name == "Cq") ~ 'Quantification cycle (Cq) value.',
        (original_name == "Starting Quantity (SQ)") ~ 'Starting quantity (SQ) value; computed from the Cq values using the inverse calibration function (calibration curve fitted using the expected SQ values for the standards).',
        (original_name == "Cq Mean") ~ 'Mean across replicates',
        (original_name == "plate_id") ~ str_c('qPCR plate ID; there are ', qpcr$plate_id |> unique() |> length()," unique plate IDs in total."),
        (original_name == "Plate_number") ~ str_c("The extraction plate number; there are ", qpcr$Plate_number |> unique() |> length()," unique plate numbers in total (5 identical `plate_id` per `Plate_number`)."),
        (original_name == "gDNA_Plate_ID") ~ 'The extraction plate ID',
        (original_name == "VMRC_Group") ~ str_c('The group of the LBP strains. There are ', qpcr$VMRC_Group |> unique() |> na.omit() |> length()," unique VMRC groups in total (", qpcr$VMRC_Group |> unique() |> na.omit() |> sort() |> str_c(collapse = ", "), ")"),
        (original_name == "Group_Fluor") ~ 'Concatenation of the `VMRC_Group` and `Fluor` columns',
        (original_name == "Target") ~ str_c('The target "gene" (here taxa/strain). There are ', qpcr$Target |> unique() |> length(), " unique targets in total (", qpcr$Target |> unique() |> str_c(collapse = ", "),")"),
        (original_name == "target_type") ~ "Whether the target is a LBP strain or the 16S rRNA gene target",
        (original_name == "gDNA_Plate_Well") ~ str_c('The concatenation of the gDNA plate ID and the plate well; there are ', qpcr$gDNA_Plate_Well |> unique() |> length()," unique gDNA plate wells in total in the format `[gDNA_Plate_ID]_[Well]` (e.g., ", qpcr$gDNA_Plate_Well |> unique() |> extract(10),")"),
        (original_name == "VIBR_Sample_ID") ~ str_c("VIBRANT sample ID; there are ", qpcr$VIBR_Sample_ID |> unique() |> length(), " unique VIBRANT sample IDs in total)"),
        (original_name == "Dilution_Factor") ~ str_c('The qPCR plate specific dilution factor. There are ', qpcr$Dilution_Factor |> unique() |> length()," unique dilution factors in total (", qpcr$Dilution_Factor |> unique()  |> sort() |> str_c(collapse = ", "),")"),
        (original_name == "Quant_Adjusted") ~ str_c("Quantification value adjusted by the dilution factor; range from ", min(qpcr$Quant_Adjusted, na.rm = TRUE)," to ", max(qpcr$Quant_Adjusted, na.rm = TRUE)),
        (original_name == "Copies_per_swab") ~ str_c("The number of target copies per swab, computed from the adjusted quantification values. Ranges from ", min(qpcr$Copies_per_swab, na.rm = TRUE)," to ", max(qpcr$Copies_per_swab, na.rm = TRUE)),
         (original_name == "InRedos") ~ str_c("Whether the sample was part of the samples that were redone (= re-extracted)."),
        TRUE ~ "????"
      )
  )
Code
# dictionary |> gt()

Tidy names

We tidy the column names for consistency and readability.

Code
qpcr <- qpcr |> janitor::clean_names()
qpcr <- qpcr |> 
  dplyr::rename(
    strain_group_qpcr = vmrc_group,
    starting_quantity = starting_quantity_sq,
    pcr_plate_id = plate_id,
    ext_lib_plate_nb = plate_number,
    ext_lib_plate_id = g_dna_plate_id,
    ext_lib_position = g_dna_plate_well
  )
Code
dictionary <- 
  dictionary |> 
  mutate(name = colnames(qpcr)) |> 
  select(name, everything())

The new column names are:

Code
dictionary |> 
  gt() |> 
  cols_label(
    name = "New column name",
    original_name = "Original column name",
    description = "Description"
  ) 
New column name Original column name Description
data_row Data_row Table row number
well Well qPCR plate well ID. From A01 to P24
fluor Fluor fluorescent dye used (one of HEX, Cy5, FAM, SYBR)
content Content Indicates whether well contains a sample, a standard, or a negative control.
replicate Replicate The replicate number from 1 to 96 (with 4193 missing values). Values are repeated 495, or 990 times.
sample Sample A qPCR plate-specific sample ID. This column wont be used or kept for downstream analyses.
cq Cq Quantification cycle (Cq) value.
starting_quantity Starting Quantity (SQ) Starting quantity (SQ) value; computed from the Cq values using the inverse calibration function (calibration curve fitted using the expected SQ values for the standards).
cq_mean Cq Mean Mean across replicates
pcr_plate_id plate_id qPCR plate ID; there are 72 unique plate IDs in total.
ext_lib_plate_nb Plate_number The extraction plate number; there are 12 unique plate numbers in total (5 identical `plate_id` per `Plate_number`).
ext_lib_plate_id gDNA_Plate_ID The extraction plate ID
strain_group_qpcr VMRC_Group The group of the LBP strains. There are 5 unique VMRC groups in total (VMRC_1, VMRC_2, VMRC_3, VMRC_4, VMRC_5)
group_fluor Group_Fluor Concatenation of the `VMRC_Group` and `Fluor` columns
target Target The target "gene" (here taxa/strain). There are 16 unique targets in total (122010, 185329, C0006A1, C0022A1, C0028A1, C0059E1, C0112A1, C0175A1, FF00004, FF00018, FF00051, FF00064, FF00072, UC101, UC119, 16S)
target_type target_type Whether the target is a LBP strain or the 16S rRNA gene target
ext_lib_position gDNA_Plate_Well The concatenation of the gDNA plate ID and the plate well; there are 1221 unique gDNA plate wells in total in the format `[gDNA_Plate_ID]_[Well]` (e.g., 2240900_G2)
vibr_sample_id VIBR_Sample_ID VIBRANT sample ID; there are 1031 unique VIBRANT sample IDs in total)
in_redos InRedos Whether the sample was part of the samples that were redone (= re-extracted).
dilution_factor Dilution_Factor The qPCR plate specific dilution factor. There are 7 unique dilution factors in total (5, 10, 20, 30, 33.333, 133.333)
quant_adjusted Quant_Adjusted Quantification value adjusted by the dilution factor; range from 0 to 3e+08
copies_per_swab Copies_per_swab The number of target copies per swab, computed from the adjusted quantification values. Ranges from 0 to 7.5e+10
Code
qpcr <- 
  qpcr |> 
  mutate(
    in_redos = (in_redos == 1) |> replace_na(FALSE) # Convert to logical
  )

Sample types and unique sample IDs

We define a qpcr_sample_type variable to summarize information from vibr_sample_id and sample columns.

Code
qpcr <-  
  qpcr |> 
  mutate(
    qpcr_sample_type = 
      case_when(
        is.na(vibr_sample_id) & str_detect(content, "Std") ~ "Standard",
        is.na(vibr_sample_id) & str_detect(sample, "water") ~ "Water",
        str_detect(vibr_sample_id, "EQ") ~ "Control sample",
        str_detect(vibr_sample_id, "empty") ~ "Empty",
        is.na(vibr_sample_id) ~ "Water or empty",
        TRUE ~ "VIBRANT clinical sample"
      )
  )
Code
# qpcr |> dplyr::filter(target == "16S") |> arrange(qpcr_sample_type) |> View()
Code
qpcr |> dplyr::count(qpcr_sample_type, name = "n_wells") |> arrange(-n_wells) |> gt()
qpcr_sample_type n_wells
VIBRANT clinical sample 49727
Standard 4068
Control sample 2112
Empty 1890
Water 540
Water or empty 126
Code
tmp <- 
  qpcr |> 
  dplyr::filter(qpcr_sample_type == "VIBRANT clinical sample", target == target[1]) |>
  group_by(vibr_sample_id) |> 
  summarize(
    n_replicates = n(),
    n_plates = ext_lib_plate_nb |> unique() |> length(),
    any_in_redos = any(in_redos)
  )

# tmp |> dplyr::count(n_plates, any_in_redos) # All samples's replicates are on a single plate

# tmp |> dplyr::count(n_replicates, n_plates, any_in_redos) # All samples have 3 replicates

We note that almost all samples’s replicates are on a single extraction plate (ext_lib_plate_nb or ext_lib_plate_id); and almost all samples have the same number of replicates (3). A few samples (those in_redos) are on two plates and have 6 replicates.

Code
tmp <- 
  qpcr |> 
  dplyr::filter(qpcr_sample_type == "VIBRANT clinical sample", target == "16S") |>
  group_by(vibr_sample_id) |> 
  summarize(
    n_replicates = n(),
    n_plates = ext_lib_plate_nb |> unique() |> length()
  )

# tmp |> dplyr::count(n_plates) # All samples's replicates are on a single plate

# tmp |> dplyr::count(n_replicates) # All samples have 3 replicates
Code
tmp |> 
  dplyr::filter(n_replicates == 6) |> 
  dplyr::left_join(qpcr |> dplyr::filter(target == "16S"), by = join_by(vibr_sample_id)) |> 
  select(vibr_sample_id, ext_lib_plate_nb, ext_lib_position, pcr_plate_id, data_row, well, cq) |> 
  gt()

We’ll fix that below.

Code
qpcr <- 
  qpcr |> 
  mutate(
    well_col = well |> str_sub(1, 1), 
    well_row = well |> str_sub(2,3) |> as.integer()
    ) |> 
  relocate(well_col, well_row, .after = well)

We also define a unique “qPCR sample ID” according to each sample type.

Code
qpcr <- 
  qpcr |> 
  mutate(
    qpcr_sample_id = 
      case_when(
        (qpcr_sample_type == "VIBRANT clinical sample") ~ vibr_sample_id,
        (qpcr_sample_type == "Control sample") ~ vibr_sample_id,
        (qpcr_sample_type == "Empty") ~ 
          str_c(vibr_sample_id,"_plate_", 
                ext_lib_plate_nb |> str_pad(width = 2, pad = 0),"_", sample),
        (qpcr_sample_type == "Water") ~ 
          str_c("water_plate", 
                ext_lib_plate_nb |> str_pad(width = 2, pad = 0)),
        (qpcr_sample_type == "Water or empty") ~ 
          str_c("water_or_empty_plate", 
                ext_lib_plate_nb |> str_pad(width = 2, pad = 0),"_", sample),
        (qpcr_sample_type == "Standard") ~ 
          str_c("std_", content |> str_remove("Std-"), 
                "_plate_", ext_lib_plate_nb |> str_pad(width = 2, pad = 0)),
        TRUE ~ NA_character_
      )
  ) |> 
  group_by(qpcr_sample_id, target) |>
  arrange(ext_lib_plate_nb, well_col, well_row) |>
  mutate(replicate_nb = row_number()) |> 
  ungroup() |> 
  mutate(qpcr_uid = str_c(qpcr_sample_id, "_r", replicate_nb))
Code
# qpcr |> dplyr::filter(replicate_nb > 3) |> View()

For the LBP strains, the plate layout is as follows for the first 11 extraction plates:

Code
qpcr_sample_type_colors <- c(
  "VIBRANT clinical sample" = "dodgerblue2",
  "Control sample" = "turquoise",
  "Standard" = "gold",
  "Water" = "lightblue1",
  "Empty" = "gray",
  "Water or empty" = "gray80"
)

qpcr |> 
  dplyr::filter(target != "16S", ext_lib_plate_nb <= 11) |> 
  select(ext_lib_plate_nb, well_col, well_row, qpcr_sample_type, sample) |>
  distinct() |>
  ggplot() +
  aes(x =  well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type) +
  facet_wrap(~ ext_lib_plate_nb) +
  geom_tile() +
  geom_path(aes(group = sample), col = "black", alpha = 0.5) +
  geom_point() +
  scale_fill_manual(values = qpcr_sample_type_colors) +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Black lines connect replicates") 

For the 12th plate, the layout is a little bit different and the first PCR plate (for the first group of 3 target) is different from the 4 other:

Code
qpcr |> 
  dplyr::filter(target != "16S", ext_lib_plate_nb == 12) |> 
  select(ext_lib_plate_nb, pcr_plate_id, well_col, well_row, qpcr_sample_type, sample, strain_group_qpcr) |>
  distinct() |>
  ggplot() +
  aes(x =  well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type) +
  facet_wrap(~ pcr_plate_id + strain_group_qpcr) +
  geom_tile() +
  geom_path(aes(group = sample), col = "black", alpha = 0.5) +
  geom_point() +
  scale_fill_manual(values = qpcr_sample_type_colors) +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Black lines connect replicates") 

For the 16S plates, the layout is a little different:

Code
qpcr |> 
  dplyr::filter(target == "16S") |> 
  select(ext_lib_plate_nb, pcr_plate_id, well_col, well_row, qpcr_sample_type, qpcr_sample_id) |>
  distinct() |>
  ggplot() +
  aes(x =  well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type, col = qpcr_sample_type) +
  facet_wrap(~ ext_lib_plate_nb + pcr_plate_id) +
  geom_tile(alpha = 0.25) +
  geom_path(aes(group = qpcr_sample_id), alpha = 0.5, linewidth = 2) +
  geom_point() +
  scale_fill_manual(values = qpcr_sample_type_colors) +
  scale_color_manual(values = qpcr_sample_type_colors) +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Lines connect replicates") 

Manifest data

We merge the qPCR data with the metagenomics manifest data to add the participant ID (pid) and (visit_code).

But before we do that, we fix the qpcr_sample_id for that sample that had 6 replicates instead of 3 in the 16S qPCR because, after discussion with Michael, we know that one of these barcode was wrong.

Code
six_replicates <- 
  qpcr |> 
  dplyr::filter(target == "16S") |> 
  group_by(qpcr_sample_id) |> 
  mutate(n_replicates = n(), none_on_plate_12 = !any(ext_lib_plate_nb == 12)) |> 
  ungroup() |> 
  dplyr::filter(n_replicates > 3, none_on_plate_12) |> 
  select(target, vibr_sample_id, qpcr_sample_id, qpcr_uid, ext_lib_position, ext_lib_plate_nb, ext_lib_plate_id, n_replicates)

six_replicates |> gt()
target vibr_sample_id qpcr_sample_id qpcr_uid ext_lib_position ext_lib_plate_nb ext_lib_plate_id n_replicates
16S 2167612 2167612 2167612_r1 2240890_E4 1 2240890 6
16S 2167612 2167612 2167612_r2 2240890_E4 1 2240890 6
16S 2167612 2167612 2167612_r3 2240890_E4 1 2240890 6
16S 2167612 2167612 2167612_r4 2240890_G5 1 2240890 6
16S 2167612 2167612 2167612_r5 2240890_G5 1 2240890 6
16S 2167612 2167612 2167612_r6 2240890_G5 1 2240890 6

We see that these 6 replicates have 2 different positions on the extraction library plate so we will be able to fix their sample ID from using extraction plate data from the technical metadata manifest using the concatenation of ext_lib_plate_id and ext_lib_position.

Code
technical_metadata_file <- 
  list.files(
    path = get_01_output_dir(), 
    pattern = "02_technical_metadata_agg_.*\\.rds$", full.names = TRUE
  ) |> 
  sort(decreasing = TRUE) |>
  extract(1)

technical_metadata <- readRDS(technical_metadata_file)

rm(technical_metadata_file)

# SE_mg <- 
#   readRDS(
#     list.files(
#       path = get_output_dir(data_source = data_source), 
#       pattern = "02_se_mg_.*\\.rds$", full.names = TRUE
#     ) |> 
#       sort(decreasing = TRUE) |>
#       extract(1) 
#   )
Code
six_replicates |> 
  mutate(ext_lib_well = ext_lib_position |> str_remove("[0-9]*_")) |> 
  dplyr::left_join(
    technical_metadata |> 
      select(swab_barcode, ext_lib_plate_nb, ext_lib_plate_id, ext_lib_position) |>
      dplyr::rename(
        ext_lib_plate_id_mg_manifest = ext_lib_plate_id,
        ext_lib_well = ext_lib_position
      )
  ) |> gt()
target vibr_sample_id qpcr_sample_id qpcr_uid ext_lib_position ext_lib_plate_nb ext_lib_plate_id n_replicates ext_lib_well swab_barcode ext_lib_plate_id_mg_manifest
16S 2167612 2167612 2167612_r1 2240890_E4 1 2240890 6 E4 2171048 2316297
16S 2167612 2167612 2167612_r2 2240890_E4 1 2240890 6 E4 2171048 2316297
16S 2167612 2167612 2167612_r3 2240890_E4 1 2240890 6 E4 2171048 2316297
16S 2167612 2167612 2167612_r4 2240890_G5 1 2240890 6 G5 2167612 2316297
16S 2167612 2167612 2167612_r5 2240890_G5 1 2240890 6 G5 2167612 2316297
16S 2167612 2167612 2167612_r6 2240890_G5 1 2240890 6 G5 2167612 2316297

One thing that is weird is that the ext_lib_plate_id in the technical metadata manifest is not the same as the one in the qPCR data:

Code
qpcr |> 
  dplyr::count(ext_lib_plate_nb, ext_lib_plate_id) |> 
  gt(caption = "Extraction library plate nb and IDs as found in the qPCR data.")
Extraction library plate nb and IDs as found in the qPCR data.
ext_lib_plate_nb ext_lib_plate_id n
1 2240890 4992
2 2240900 4992
3 2316397 4992
4 2316390 4992
5 2316420 4992
6 2316419 4992
7 2316095 4992
8 2316093 4992
9 2315117 4992
10 2316391 4992
11 2329564 4991
12 45786 45
12 2334591 153
12 NA 3354
Code
technical_metadata |> 
  dplyr::count(ext_lib_plate_nb, ext_lib_plate_id) |> 
  gt(caption = "Extraction library plate nb and IDs as found in the metagenomics technical metadata.")
Extraction library plate nb and IDs as found in the metagenomics technical metadata.
ext_lib_plate_nb ext_lib_plate_id n
1 2316297 89
2 2240891 88
3 2316307 86
4 2316306 88
5 2316310 87
6 2316311 89
7 2316094 88
8 2316097 88
9 2316392 79
10 2316393 82
11 2329564 85
12 2240891 1
12 2316094 2
12 2316097 2
12 2316297 2
12 2316306 1
12 2316307 2
12 2316310 2
12 2316311 1
12 2316392 2
12 2316393 2
NA NA 63
Warning

@Michael: What are your thoughts on this? Am I assuming they should be the same when there is no reason for them to be the same?

Regardless, since the ext_lib_plate_nb seems to match, we’ll use this to fix that 6-replicates sample. From the table above, it looks like the qpcr_sample_id for the 16S qPCR located in well E4 should be 2171048 instead of 2167612.

We fix their vibr_sample_id, qpcr_sample_id, qpcr_uid, and replicate_nb columns.

Code
qpcr <- 
  qpcr |> 
  mutate(
    needs_change = 
      (qpcr_sample_id == "2167612" & 
         target == "16S" & 
         ext_lib_plate_nb == 1 & 
         ext_lib_position == "2240890_E4"),
    needs_replicates_change = 
      (qpcr_sample_id == "2167612" & 
         target == "16S" & 
         ext_lib_plate_nb == 1 & 
         ext_lib_position == "2240890_G5"),
    qpcr_sample_id = case_when(
      needs_change ~ "2171048",
      TRUE ~ qpcr_sample_id
    ),
    vibr_sample_id = case_when(
      needs_change ~ "2171048",
      TRUE ~ vibr_sample_id
    ),
    qpcr_uid = case_when(
      needs_change ~ qpcr_uid |> str_replace("2167612", "2171048"),
      needs_replicates_change ~ qpcr_uid |> str_replace("r4", "r1") |> str_replace("r5", "r2") |> str_replace("r6", "r3"),
      TRUE ~ qpcr_uid
    ),
    replicate_nb = 
      case_when(
        needs_replicates_change ~ replicate_nb - 3L,
        TRUE ~ replicate_nb
      )
  ) |> 
  select(-needs_change, needs_replicates_change)

Now that this is fixed, we can merge the technical metadata with the qPCR data to be able to build their uid (VIBRANT cross-assay unique identifier) and identify their sample_type, control_type, etc.

Code
qpcr <- 
  qpcr |> 
  dplyr::left_join(
    technical_metadata |> 
      select(
        swab_barcode,
        uid, mg_uid, 
        mg_pid, mg_visit_code, 
        sample_type, control_type
      ),
    by = join_by("qpcr_sample_id" == "swab_barcode")
  )
Code
qpcr |>
  dplyr::filter(is.na(mg_uid)) |> 
  dplyr::count(content, vibr_sample_id) |> 
  View()
Code
samples_without_entries_in_mg_manifest <- 
  qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")) |>
  dplyr::filter(is.na(uid)) |> 
  select(qpcr_sample_id, qpcr_sample_type) |> 
  distinct()  |> 
  arrange(qpcr_sample_type, qpcr_sample_id) 

There are 1 clinical or vibrant positive/negative control samples without a matching entry in the metagenomics manifest.

Code
samples_without_entries_in_mg_manifest |> 
  # dplyr::count(qpcr_sample_type) |>
  gt()
qpcr_sample_id qpcr_sample_type
2320838 VIBRANT clinical sample

In an email from June 16th 2025, Michael indicated that this is a negative biological sample (Unused swab + C2).

We fix it manually for data completion:

Code
qpcr <- 
  qpcr |> 
  mutate(
    sample_type = 
      case_when(
        qpcr_sample_id == "2320838" ~ "Negative control",
        TRUE ~ sample_type |> as.character()
      ) |> factor(levels = levels(qpcr$sample_type)),
    control_type =
      case_when(
        qpcr_sample_id == "2320838" ~ "Unused swab + C2",
        TRUE ~ control_type |> as.character()
      ) |> factor(levels = levels(qpcr$control_type)),
    qpcr_sample_type = 
      case_when(
        qpcr_sample_id == "2320838" ~ "Control sample",
        TRUE ~ qpcr_sample_type
      )
  )

For these samples, the current uid is missing. We change it to the qpcr_sample_id when NA.

Code
qpcr <- 
  qpcr |> 
  mutate(
    uid = case_when(
      is.na(uid) & 
        (qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")) ~ 
        qpcr_sample_id,
      TRUE ~ uid
    )
  )
Code
mg_samples <- 
  technical_metadata |> 
  select(
    swab_barcode,
    uid, mg_uid, 
    mg_pid,mg_visit_code, 
    sample_type, control_type,
    ext_lib_plate_nb, no_mg_data
  ) |> 
  distinct() |> 
  dplyr::left_join(
    qpcr |> 
      select(qpcr_sample_id, qpcr_sample_type) |> 
      distinct() |> 
      dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")),
    by = join_by("swab_barcode" == "qpcr_sample_id")
  ) |> 
  arrange(uid)

# mg_samples |> dplyr::filter(is.na(qpcr_sample_type)) |> nrow()

There are 1 samples for which we have metagenomics data but no qPCR data:

Code
mg_samples |> dplyr::filter(is.na(qpcr_sample_type)) |> gt()
swab_barcode uid mg_uid mg_pid mg_visit_code sample_type control_type ext_lib_plate_nb no_mg_data qpcr_sample_type
MC1 MC1 PC_MC1 Mock1 NA Positive control Mock 1 NA FALSE NA
Code
mg_samples <- 
  technical_metadata |> 
  select(
    swab_barcode,
    uid, mg_uid, 
    mg_pid,mg_visit_code, 
    sample_type, control_type,
    ext_lib_plate_nb, no_mg_data
  ) |> 
  distinct() |> 
  dplyr::left_join(
    qpcr |>
      select(qpcr_sample_id, qpcr_sample_type, target) |> 
      distinct() |> 
      dplyr::filter(
        target == "16S",
        qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")
        ),
    by = join_by("swab_barcode" == "qpcr_sample_id")
  ) |> 
  arrange(uid)

And there are 1 samples for which we have metagenomics data but no 16S qPCR data:

Code
mg_samples |> dplyr::filter(is.na(qpcr_sample_type)) |> gt()
swab_barcode uid mg_uid mg_pid mg_visit_code sample_type control_type ext_lib_plate_nb no_mg_data qpcr_sample_type target
MC1 MC1 PC_MC1 Mock1 NA Positive control Mock 1 NA FALSE NA NA
Code
rm(mg_samples, samples_without_entries_in_mg_manifest, tmp)
Code
qpcr |> 
  dplyr::count(qpcr_sample_type, sample_type, control_type)
Code
qpcr <- 
  qpcr |> 
  mutate(
    sample_type = 
      case_when(
        is.na(sample_type) & (qpcr_sample_type == "VIBRANT clinical sample") ~ "Clinical sample",
        is.na(sample_type) & (qpcr_sample_type %in% c("Empty", "Water", "Water or empty")) ~ "Technical negative control",
        is.na(sample_type) & (qpcr_sample_type == "Standard") ~ "Standard",
        TRUE ~ sample_type
      ),
    control_type =
      case_when(
        is.na(control_type) & (qpcr_sample_type != "VIBRANT clinical sample") ~ qpcr_sample_type,
        is.na(control_type) & (qpcr_sample_type == "VIBRANT clinical sample") ~ "",
        TRUE ~ control_type
      )
  )

Target’s plate layout and fluorescent probes

The plate x Fluorescence x Target layout is as follows:

Code
qpcr |> 
  dplyr::count(fluor, target, strain_group_qpcr, pcr_plate_id, ext_lib_plate_nb) |> 
  arrange(ext_lib_plate_nb, strain_group_qpcr, pcr_plate_id, fluor) |> 
  mutate(
    target = target |> fct_inorder(), 
    plate_nb = str_c("ext. plate: ", ext_lib_plate_nb) |> fct_inorder(),
    pcr_plate_id = pcr_plate_id |> fct_inorder()
    ) |> 
  ggplot() +
  aes(
    x = pcr_plate_id,
    y = target |> factor() |> fct_rev(),
    fill = fluor
  ) +
  geom_tile() +
  facet_grid(strain_group_qpcr ~ plate_nb, scales = "free", space = "free") +
  xlab("qPCR plate ID") +
  ylab("Target") +
  scale_fill_manual(
    name = "Fluor.",
    values = c("FAM" = "dodgerblue1", "HEX" = "green3", "Cy5" = "#F8766D", "SYBR" = "green")
  ) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

Potential batch effects due to extraction should be checked for ext_lib_plate_id and those due to the qPCR itself using the pcr_plate_id.

LBP strain information

We also add the LBP strain information to the qPCR data.

Code
SE_mg <-
  readRDS(
    list.files(
      path = get_01_output_dir(),
      pattern = "02_se_mg_.*\\.rds$", full.names = TRUE
    ) |>
      sort(decreasing = TRUE) |>
      extract(1)
  )

qpcr <- 
  qpcr |> 
  dplyr::left_join(
    SE_mg |>
      rowData() |> as.data.frame() |> as_tibble() |> 
      dplyr::filter(!is.na(LBP)) |> 
      select(taxon_id, taxon_label, LBP, strain_id, strain_origin, biose_id) |> 
      dplyr::rename(target = taxon_id),
    by = join_by(target)
  )
Code
qpcr <- 
  qpcr |> 
  mutate(taxon_label = taxon_label |> str_replace_na("16S rRNA gene target"))
Code
rm(SE_mg)
Code
qpcr |> 
  arrange(strain_group_qpcr, strain_origin) |>
  mutate(target = target |> fct_inorder()) |> 
  ggplot() +
  aes(x = target, y = strain_group_qpcr, col = strain_group_qpcr) +
  geom_point() +
  facet_grid(. ~ LBP + strain_origin, scales = "free", space = "free")  +
  xlab("Target") + ylab("VMRC group") +
  guides(col = "none") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) 

Relationships between columns

Observations and calibration curves

Observed values are the cq values.

starting_quantity are computed from the cq values using the calibration curves fitted on the standard values.

\[C = f(SQ^*)\]

where \(C\) is the observed cq value for the standard, \(SQ^*\) is the known starting quantity starting_quantity for the standards, and \(f\) is the calibration function.

Note

I assume that, either the standards are not diluted, and that the quant_adjusted and copies_per_swabs are not correct for the standards, or that the standards are diluted and that the calibration curves are fitted on the quant_adjusted:

Code
qpcr |>
  dplyr::filter(qpcr_sample_type == "Standard", target != "16S") |> 
  select(
    sample, starting_quantity, dilution_factor, quant_adjusted, copies_per_swab
  ) |> 
  distinct() |> 
  arrange(sample, dilution_factor) |> 
  gt()
sample starting_quantity dilution_factor quant_adjusted copies_per_swab
std 10^1 1e+01 5 5e+01 1.25e+04
std 10^1 1e+01 10 1e+02 2.50e+04
std 10^1 1e+01 20 2e+02 5.00e+04
std 10^1 1e+01 30 3e+02 7.50e+04
std 10^1 1e+01 NA NA NA
std 10^2 1e+02 5 5e+02 1.25e+05
std 10^2 1e+02 10 1e+03 2.50e+05
std 10^2 1e+02 20 2e+03 5.00e+05
std 10^2 1e+02 30 3e+03 7.50e+05
std 10^2 1e+02 NA NA NA
std 10^3 1e+03 5 5e+03 1.25e+06
std 10^3 1e+03 10 1e+04 2.50e+06
std 10^3 1e+03 20 2e+04 5.00e+06
std 10^3 1e+03 30 3e+04 7.50e+06
std 10^3 1e+03 NA NA NA
std 10^4 1e+04 5 5e+04 1.25e+07
std 10^4 1e+04 10 1e+05 2.50e+07
std 10^4 1e+04 20 2e+05 5.00e+07
std 10^4 1e+04 30 3e+05 7.50e+07
std 10^4 1e+04 NA NA NA
std 10^5 1e+05 5 5e+05 1.25e+08
std 10^5 1e+05 10 1e+06 2.50e+08
std 10^5 1e+05 20 2e+06 5.00e+08
std 10^5 1e+05 30 3e+06 7.50e+08
std 10^5 1e+05 NA NA NA
std 10^6 1e+06 5 5e+06 1.25e+09
std 10^6 1e+06 10 1e+07 2.50e+09
std 10^6 1e+06 20 2e+07 5.00e+09
std 10^6 1e+06 30 3e+07 7.50e+09
std 10^6 1e+06 NA NA NA
std 10^7 1e+07 5 5e+07 1.25e+10
std 10^7 1e+07 10 1e+08 2.50e+10
std 10^7 1e+07 20 2e+08 5.00e+10
std 10^7 1e+07 30 3e+08 7.50e+10
std 10^7 1e+07 NA NA NA
Code
qpcr |>
  dplyr::filter(qpcr_sample_type == "Standard") |> 
  arrange(-dilution_factor) |>
  ggplot() +
  aes(
    # x = starting_quantity |> log10(), 
    x = quant_adjusted |> log10(),
    y = cq, 
    col = dilution_factor |> factor()
  ) +
  geom_point(size = 1, alpha = 0.2) +
  scale_color_manual(
    "Dilution factor", 
    values = colorRampPalette(c("black", "dodgerblue1"))(4)
    ) +
  facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) 

For all clinical samples, the starting_quantity is computed from the cq values using the calibration curve fitted on the standard values.

\[\hat{SQ} = f^{-1}(C)\]

where \(\hat{SQ}\) is the estimated starting quantity starting_quantity for the clinical samples, \(C\) is the observed cq value for the clinical sample, and \(f^{-1}\) is the inverse of the calibration function.

If \(C\) is not observed (i.e., assumed to be larger than the total number of cycles), then \(\hat{SQ}\) is estimated as 0.

Code
# qpcr |> 
#   ggplot() +
#   aes(x = starting_quantity |> log10(), y = cq, col = qpcr_sample_type) +
#   geom_point(size = 0.5, alpha = 0.4) +
#   facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb) 


qpcr |> 
  ggplot() +
  aes(
    x = starting_quantity |> log10(), 
    y = cq, 
    col = ext_lib_plate_nb |> factor(),
    size = (qpcr_sample_type == "Standard")
    ) +
  geom_point(alpha = 0.5) +
  ggh4x::facet_nested_wrap(vars(strain_group_qpcr, target), ncol = 6) +
  scale_size_manual("Standard", values = c(0.2, 1)) +
  scale_color_viridis_d("Extraction plate number", direction = -1, option = "A", end = 0.8) 

It looks like \(f\) is a linear function of \(\log(SQ^*)\): \(C = \beta_0 + \beta_1 \log(SQ^*)\). This seems appropriate for the LBP strains, but for the total 16S, the standard curves are not linear. Probably not too much of an issue and unlikely to create huge batch effects.

Adjusted quantities

Adjusted quantities (quant_adjusted) are then computed from the estimated starting_quantity values using the dilution factors.

\(\hat{Q} = \hat{SQ} \times d\)

where \(\hat{Q}\) is the estimated adjusted quantity (quant_adjusted), \(\hat{SQ}\) is the estimated starting quantity (starting_quantity) for the clinical samples, and \(d\) is the dilution factor.

Code
qpcr |>
  ggplot() +
  aes(
    x = starting_quantity |> log10(), 
    y = quant_adjusted |> log10(), 
    col = dilution_factor |> factor()
    ) +
  geom_point() +
  ggh4x::facet_nested_wrap(vars(strain_group_qpcr, target), ncol = 9) +
  scale_color_manual(
    "Dilution factor", 
    values = colorRampPalette(c("dodgerblue1", "gray90"))(6)
    )

quant_adjusted = starting_quantity x dilution_factor:

Code
qpcr |>
  ggplot() +
  aes(
    x = (starting_quantity * dilution_factor) |> log10(), 
    y = (quant_adjusted) |> log10(), 
    col = dilution_factor |> factor()
    ) +
  geom_point() +
  ggh4x::facet_nested_wrap(vars(strain_group_qpcr, target), ncol = 9) +
  scale_color_manual(
    "Dilution factor", 
    values = colorRampPalette(c("dodgerblue1", "gray90"))(6)
    )

Copies per swab

copies_per_swab are the adjusted_quant multiplied by a constant factor estimated to be the number of bacteria in 1 concentration unit.

Here, that number is 250 for all LBP targets.

Code
# strain_group_qpcr + target

qpcr |>
  dplyr::filter(target != "16S") |> 
  ggplot() +
  aes(
    x = (quant_adjusted * 250) |> log10(), 
    y = copies_per_swab |> log10(), 
    col =  target
    ) +
  geom_abline(intercept = 0, slope = 1, col = "gray") +
  geom_point(size = 0.5, alpha = 0.4) +
  facet_wrap(. ~ ext_lib_plate_nb , ncol = 6) 

For the 16S target, that number is 500, except for plate 12, where it is 250.

Code
qpcr |>
  dplyr::filter(target == "16S") |> 
  mutate(
    r = copies_per_swab / (quant_adjusted )
  ) |> 
  pull(r) |> 
  hist()

This was actually a mistake and it should be 250 for all plates for the 16S target.

We fix this here:

Code
qpcr <- 
  qpcr |> 
  mutate(
    copies_per_swab = 
      case_when(
        (qpcr_sample_type != "Standard") & 
          (target == "16S") & 
          (ext_lib_plate_nb < 12) ~ 
          copies_per_swab / 2,
        TRUE ~ 
          copies_per_swab
      )
  )

Dilution factors

Dilution factors are defined per qPCR plate (and consequently, are the same for the 3 targets in the same VMRC group)

Code
qpcr |> 
  dplyr::count(target, dilution_factor) 

qpcr |> 
  group_by(pcr_plate_id, target) |>
  summarize(n_dilution_factors = dilution_factor |> unique() |> length()) 
Code
#|

qpcr |> 
  select(
    starts_with("well"), 
    pcr_plate_id, 
    strain_group_qpcr, 
    ext_lib_plate_nb, 
    dilution_factor
    ) |> 
  distinct() |> 
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = dilution_factor |> factor()
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr ~ ext_lib_plate_nb, scales = "free", space = "free") +
  scale_fill_manual(
    "Dilution factor", 
    values = colorRampPalette(c("dodgerblue1", "gray90"))(6)
    ) +
  xlab("Well column") + ylab("Well row") 

Exploratory & QC analyses

CQ and Copies per swabs per well

Code
qpcr |> 
  select(
    starts_with("well"), 
    pcr_plate_id, 
    target,
    strain_group_qpcr, 
    ext_lib_plate_nb, 
    cq
    ) |> 
  distinct() |> 
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = cq
    ) +
  geom_tile() +
  ggh4x::facet_nested(cols = vars(ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

Code
qpcr |> 
  select(
    starts_with("well"), 
    pcr_plate_id, 
    target,
    strain_group_qpcr, 
    ext_lib_plate_nb, 
    copies_per_swab
    ) |> 
  distinct() |> 
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = copies_per_swab |> asinh()
    ) +
  geom_tile() +
  ggh4x::facet_nested(cols = vars(ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

Warning

From these visualizations of the raw data, something looks weird with the cq and copies_per_swab values for the first 3 plates of “VMRC group 1”.

Total number of copies per swab per sample type

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = target, y = copies_per_swab, col = qpcr_sample_type, fill = qpcr_sample_type) +
  geom_boxplot(alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(. ~ target_type, scales = "free", space = "free")

g 

Code
g + scale_y_log10()

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = target, y = copies_per_swab, col = qpcr_sample_type, fill = qpcr_sample_type) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(qpcr_sample_type ~ ., scales = "free") +
  guides(fill = "none", color = "none") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  ) +
  facet_grid(. ~ target_type, scales = "free", space = "free")

g 

Code
g + scale_y_log10()

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = ext_lib_plate_nb |> factor(), y = copies_per_swab, col = target, fill = target) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(qpcr_sample_type ~ ., scales = "free") +
  xlab("Extraction plate number") +
  theme(
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90, hjust = 0)
  )

g 

Code
g + scale_y_log10() 

Empty and water samples

From the plots above, it looks like the empty and water samples have non-zero values on a few LBP plates.

Code
plot_qpcr_empties <- function(qpcr, color_by = "qpcr_sample_type") {
  qpcr |> 
    dplyr::filter(qpcr_sample_type %in% c("Empty", "Water")) |> 
    group_by(sample, pcr_plate_id, ext_lib_plate_nb, target) |>
    mutate(replicate_nb = row_number()) |> 
    ungroup() |> 
    ggplot() +
    aes(x = sample, y = copies_per_swab, label = replicate_nb, col = !!sym(color_by)) +
    geom_text(size = 3) +
    ggh4x::facet_nested(cols = vars(ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free_x", space = "free_x") +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      strip.text.y = element_text(angle = 0)
    ) 
}
Code
plot_qpcr_empties(qpcr, color_by = "qpcr_sample_type") 

Code
plot_qpcr_empties(qpcr |> mutate(zero_copies = (copies_per_swab == 0)), color_by = "zero_copies") 

Code
prob_target <-
  qpcr |> 
  select(target) |> 
  distinct() |> 
  mutate(prob_target = (target %in% c("C0022A1", "C0059E1", "C0175A1")))
Warning

We see the same issues as above for the first 3 plates of VMRC group 1.

Control samples

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("Control sample")) |> 
  group_by(sample, pcr_plate_id, ext_lib_plate_nb, target) |>
  mutate(replicate_nb = row_number()) |> 
  ungroup() |> 
  mutate(
    control_type = control_type |> str_replace_na("? Unknown (not in MG manifest)"),
    vibr_sample_id = vibr_sample_id |> fct_reorder(control_type),
    plate_nb = str_c("ext. plate\n", ext_lib_plate_nb) |> fct_reorder(ext_lib_plate_nb),
    ) |> 
  ggplot() +
  aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = control_type) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("Control type") +
  ggh4x::facet_nested(cols = vars(plate_nb), rows = vars(strain_group_qpcr, target), scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    # strip.text.x = element_text(angle = 90),
    strip.text.y = element_text(angle = 0)
  ) 

These look consistent with expectations, except, again, for the first 3 plates of VMRC group 1.

Standards

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% "Standard") |>
  ggplot() +
  aes(x = starting_quantity, y = cq, col = replicate_nb |> factor()) +
  geom_point(alpha = 0.5, size = 0.5) +
  ggh4x::facet_nested(cols = vars(ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free_x", space = "free_x") +
  scale_x_log10() +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

For the 16S, there seems to be “reverse saturation” (i.e., the cq is lower than expected with a linear relationship). Maybe the dilution was not that reliable at such low concentrations.

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% "Standard") |>
  mutate(std_nb = content |> str_remove("Std-") |> parse_number()) |> 
  ggplot() +
  aes(x = starting_quantity, y = cq, col = dilution_factor |> factor()) +
  geom_path(aes(group = interaction(ext_lib_plate_nb, replicate_nb)), alpha = 0.2) +
  geom_text(alpha = 0.2, aes(label = std_nb)) +
  ggh4x::facet_nested_wrap(vars(strain_group_qpcr, target), ncol = 9) +
  scale_x_log10() +
  scale_color_manual(
    name = "Dilution factor",
    breaks = c(5,10,20,30),
    values = c("black", "deeppink3", "deeppink1", "pink")
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

It looks like standard 4 was not prepared or placed properly for VMRC group 4.

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% "Standard") |>
  mutate(std_nb = content |> str_remove("Std-") |> parse_number()) |> 
  ggplot() +
  aes(x = starting_quantity, y = cq, col = ext_lib_plate_nb |> factor()) +
  geom_path(aes(group = interaction(ext_lib_plate_nb, replicate_nb)), alpha = 0.2) +
  geom_text(alpha = 0.2, aes(label = std_nb)) +
  ggh4x::facet_nested_wrap(vars(strain_group_qpcr, target), ncol = 9) +
  scale_x_log10() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

This shows again the issues with the first 3 plates of VMRC group 1.

Clinical samples

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |> 
  group_by(vibr_sample_id, sample, pcr_plate_id, ext_lib_plate_nb, target) |>
  mutate(
    replicate_nb = row_number(),
    median_cps = median(copies_per_swab, na.rm = TRUE),
    mean_cps = mean(copies_per_swab, na.rm = TRUE)
    ) |> 
  ungroup() |> 
  mutate(vibr_sample_id = vibr_sample_id |> fct_reorder(mean_cps)) |>
  ggplot() +
  aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = replicate_nb |> factor()) +
  geom_line(aes(group = vibr_sample_id), col = "black", linewidth = 0.1) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  scale_x_discrete("Samples", breaks = NULL) +
  ggh4x::facet_nested(cols = vars(ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

We notice again the same issues with the VRMC group 1 plate 1-3.

Warning

We could simply discard these values (exclude these targets x plates) or try to identify a new threshold to differentiate between background noise and actual signal.

Replicates

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |> 
  group_by(qpcr_sample_id, vibr_sample_id, target) |>
  mutate(
    n_replicates = n(),
    replicate_nb = row_number(),
    median_cps = median(copies_per_swab, na.rm = TRUE),
    mean_cps = mean(copies_per_swab, na.rm = TRUE),
    min_ext_lib_plate_nb = min(ext_lib_plate_nb, na.rm = TRUE),
    ) |> 
  ungroup() |> 
  dplyr::filter(n_replicates > 3) |>
  mutate(vibr_sample_id = vibr_sample_id |> fct_reorder(mean_cps)) |>
  ggplot() +
  aes(x = ext_lib_plate_nb |> factor(), y = copies_per_swab, label = replicate_nb, col = (ext_lib_plate_nb == 12)) +
  ggh4x::facet_nested(cols = vars(min_ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free_x", space = "free_x") +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  scale_x_discrete("Extraction plate") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    strip.text.y = element_text(angle = 0),
  )

Agreement between pairs of triplicates is good except for the samples that initially were on the first 3 plates for VMRC group 1.

New detection thresholds ?

The idea is to use the negative controls cq values to define plate- and target-specific “detection” thresholds, then use these thresholds to set to 0 the copies_per_swab values for which the cq values are above the threshold.

The figures below show the cq values for the VMRC group 1 and 2.

Code
qpcr |>
  dplyr::filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2")) |>
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = cq
    ) +
  geom_tile() +
  ggh4x::facet_nested(cols = vars(ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

We impute the missing cq values to 45 (well above the maximum observed value, and what I assume was the total number of cycles).

Code
qpcr <- qpcr |> mutate(cq_imp = cq |> replace_na(45)) 
Code
qpcr |>
  dplyr::filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2")) |>
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = cq_imp
    ) +
  geom_tile() +
  ggh4x::facet_nested(cols = vars(ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

Comparing the cq values per sample type

Code
qpcr <- 
  qpcr |>
  mutate(
    simplified_type = 
      case_when(
        str_detect(control_type, "Mock") ~ "Positive control",
        (qpcr_sample_type == "Control sample") & is.na(sample_type) ~ "Unknown control",
        qpcr_sample_type %in% c("Control sample", "Empty", "Water") ~ "Negative control",
        TRUE ~ qpcr_sample_type
      ) |> 
      fct_infreq(),
    qpcr_sample_type = qpcr_sample_type |> fct_infreq()
    ) 

# qpcr |> dplyr::count(simplified_type, sample_type, control_type, qpcr_sample_type)

We define the thresholds as the average over the two smallest cq value for the “empties” (i.e., empty and water samples, as well as the negative controls).

Code
thresholds <- 
  qpcr |> 
  dplyr::filter(target != "16S") |> 
  group_by(target, strain_group_qpcr, ext_lib_plate_nb) |> 
  summarize(
    min_cq_empties = min(cq_imp[simplified_type == "Negative control"], na.rm = TRUE),
    robust_min_cq_empties = cq_imp[simplified_type == "Negative control"] |> sort() |> extract(1:2) |> mean(),
    max_cq_std = max(cq_imp[simplified_type == "Standard"], na.rm = TRUE),
    .groups = "drop"
  ) |> 
  mutate(
    threshold_cq = robust_min_cq_empties
  )
Code
qpcr |> 
  dplyr::filter(
    strain_group_qpcr %in% c("VMRC_1", "VMRC_2"), 
    ext_lib_plate_nb <= 6,
    simplified_type %in% c("VIBRANT clinical sample", "Standard", "Negative control") 
    ) |> 
  arrange(simplified_type) |> 
  ggplot() +
  aes(x = simplified_type |> str_wrap(width = 15), y = cq_imp, col = qpcr_sample_type) + #  
  geom_violin(col = "gray") +
  geom_jitter(height = 0, width = 0.25, alpha = 0.5, size = 0.75) +
  geom_hline(
    data = thresholds |> dplyr::filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2"), ext_lib_plate_nb <= 6), 
    aes(yintercept = threshold_cq), col = "steelblue1"
    ) +
  facet_grid(target ~ ext_lib_plate_nb) +
  scale_color_manual(values = c("steelblue1", "black", "dodgerblue3", "red","pink") |> rev()) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) +
  xlab("Sample type") 

Code
qpcr <- 
  qpcr |> 
  select(-any_of(c("min_cq_empties","robust_min_cq_empties", "max_cq_std", "threshold_cq"))) |> 
  dplyr::left_join(thresholds, by = join_by(ext_lib_plate_nb, strain_group_qpcr, target)) 

For all plates and targets

Code
qpcr |>
  dplyr::filter(qpcr_sample_type == "VIBRANT clinical sample") |> 
  ggplot() +
  aes(x = cq_imp, y = target) +
  geom_violin(fill = "gray90", col = "transparent") +
  geom_jitter(width = 0, size = 0.1, col = "pink") +
  geom_vline(aes(xintercept = min_cq_empties), col = "dodgerblue") +
  geom_vline(aes(xintercept = robust_min_cq_empties), col = "green3") +
  geom_vline(aes(xintercept = max_cq_std), col = "red") +
  ggh4x::facet_nested(cols = vars(ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free_y", space = "free_y") +
  theme(strip.text.y = element_text(angle = 0)) +
  labs(
    x = "cq value (imputed to 45 when not avalaible)",
    caption = 
      str_c(
        "Pink dots are the cq values for the VIBRANT clinical samples\n",
        "The blue lines are the minimum cq value for the negative controls\n",
        "The green lines are the average of the two smallest cq values for the negative controls (= the detection threshold)\n",
        "The red lines are the maximum cq value for the standards"
      )
  )

Adjusted copies per swabs

We create a new column copies_per_swab_adj that is set to 0 if the cq_imp is above the threshold.

Code
qpcr <- 
  qpcr |> 
  mutate(
    copies_per_swab_adj = 
      case_when(
        (target != "16S") & (cq_imp >= threshold_cq) ~ 0,
        TRUE ~ copies_per_swab
      )
  )
Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |> 
  group_by(qpcr_sample_id, sample, pcr_plate_id, ext_lib_plate_nb, target) |>
  mutate(
    replicate_nb = row_number(),
    median_cps = median(copies_per_swab_adj, na.rm = TRUE),
    mean_cps = mean(copies_per_swab_adj, na.rm = TRUE)
    ) |> 
  ungroup() |> 
  mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(mean_cps)) |>
  ggplot() +
  aes(x = qpcr_sample_id, y = copies_per_swab_adj, label = replicate_nb, col = replicate_nb |> factor()) +
  geom_line(aes(group = qpcr_sample_id), col = "black", linewidth = 0.1) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  scale_x_discrete("Samples", breaks = NULL) +
  ggh4x::facet_nested(cols = vars(ext_lib_plate_nb), rows = vars(strain_group_qpcr, target), scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

This does not fully solve the problem, but I hope that at the “aggregation” step, taking the median value would remove a lot of false positive.

If not, a more stringent threshold could potentially be used, or these samples would have to be excluded from the analysis.

Aggregation across replicates

Code
qpcr <- 
  qpcr |> 
  group_by(qpcr_sample_id, target) |> 
  mutate(
    copies_per_swab_raw_median = median(copies_per_swab, na.rm = TRUE),
    copies_per_swab_adj_median = median(copies_per_swab_adj, na.rm = TRUE),
    copies_per_swab_adj_range = max(copies_per_swab_adj, na.rm = TRUE) - min(copies_per_swab_adj, na.rm = TRUE),
  ) |> 
  ungroup()
Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
  group_by(qpcr_sample_id) |> mutate(n_target_detected = sum(copies_per_swab_raw_median > 0)) |> ungroup() |>
  mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(n_target_detected)) |>
  ggplot() +
  aes(x = qpcr_sample_id, y = target, fill = copies_per_swab_raw_median |> log10()) +
  geom_tile() +
  facet_grid(strain_group_qpcr + LBP  ~ ext_lib_plate_nb, scales = "free", space = "free") +
  scale_fill_gradient(low = "dodgerblue1", high = "black", na.value = "white") +
  theme(
    axis.text.x = element_blank(),
    legend.position = "bottom"
  ) +
  ggtitle("Median of the original `copies_per_swabs`")

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
  group_by(qpcr_sample_id) |> mutate(n_target_detected = sum(copies_per_swab_adj_median > 0)) |> ungroup() |>
  mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(n_target_detected)) |>
  ggplot() +
  aes(x = qpcr_sample_id, y = target, fill = copies_per_swab_adj_median |> log10()) +
  geom_tile() +
  facet_grid(strain_group_qpcr + LBP  ~ ext_lib_plate_nb, scales = "free", space = "free") +
  scale_fill_gradient(low = "dodgerblue1", high = "black", na.value = "white") +
  theme(
    axis.text.x = element_blank(),
    legend.position = "bottom"
  ) +
  ggtitle("Median of the adjusted `copies_per_swabs`")

The threshold correction likely helped in reducing the number of false positive.

Longitudinal profiles

When looking at the longitudinal patterns for randomized participants, we do not notice an over-representation of a specific plate or group in the “isolated” positives:

Code
qpcr |> 
  dplyr::filter(
    sample_type %in% c("Clinical sample"), 
    # sample_category %in% c("Expected sample"),
    mg_visit_code %in% c(seq(1000, 1900, by = 100), 2120)
  ) |>
  group_by(mg_pid) |> mutate(n_target_detected = sum(copies_per_swab_adj_median[!is.na(LBP)] > 0)) |> ungroup() |>
  mutate(mg_pid = mg_pid |> fct_reorder(n_target_detected)) |>
  ggplot() +
  aes(
    x = mg_pid, y = mg_visit_code, 
    fill = ext_lib_plate_nb |> factor(), 
    alpha = copies_per_swab_adj_median |> log10()
  ) +
  geom_tile() +
  ggh4x::facet_nested(rows = vars(strain_group_qpcr, target), scales = "free", space = "free") +
  scale_fill_discrete("Extraction plate number") +
  scale_alpha_continuous(limits = c(0, 10)) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    strip.text.y = element_text(angle = 0),
    legend.position = "bottom"
  )

We should thus expect some false positive in the LBP qPCR data :)

Code
qpcr |> 
  dplyr::filter(
    target != "16S",
    sample_type %in% c("Clinical sample"), 
    mg_visit_code %in% c(0, seq(1000, 1900, by = 100), 2120)
  ) |>
  # group_by(mg_pid) |> mutate(n_target_detected = sum(copies_per_swab_adj_median[!is.na(LBP)] > 0)) |> ungroup() |>
  # mutate(mg_pid = mg_pid |> fct_reorder(n_target_detected)) |>
  mutate(mg_pid = mg_pid |> factor()) |>
  mutate(target = target |> fct_reorder(as.numeric(factor(LBP)) + as.numeric(factor(strain_origin))/10)) |> 
  ggplot() +
  aes(
    x = target, y = mg_pid |> fct_rev(), 
    fill = LBP,
    alpha = copies_per_swab_adj_median |> log10()
  ) +
  geom_tile() +
  facet_grid(. ~ mg_visit_code) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    # strip.text.y = element_text(angle = 0),
    legend.position = "bottom"
  )

There are clear patterns in the data, hinting at the participants’ arms :) However, this also suggests that there might be quite a lot of false positive and a low specificity for the qPCR probes.

Code
qpcr |> 
  dplyr::filter(
    sample_type %in% c("Clinical sample"), 
    # sample_category %in% c("Expected sample"),
    mg_visit_code %in% c(seq(1000, 1900, by = 100), 2120),
    target == "16S"
    ) |>  
  arrange(mg_pid, mg_visit_code) |> 
  select(mg_pid, mg_visit_code, copies_per_swab_adj_median) |>
  distinct() |> 
  ggplot() +
  aes(x = mg_visit_code, y = copies_per_swab_adj_median, col = mg_pid) +
  geom_boxplot(alpha = 0.2, fill = "dodgerblue1", color = "dodgerblue1") +
  geom_line(aes(group = mg_pid), alpha = 0.2) +
  # geom_point(size = 0.5) +
  ggbeeswarm::geom_quasirandom(width = 0.2) +
  scale_y_log10() +
  guides(col = "none") +
  ggtitle("16S copies per swab") 

Specificity and sensitivity estimation from Mock positive controls and from screening visits samples

In this section, we use the biological controls (Mock samples) to estimate the sensitivity (true positive rate) of the qPCR assays and the screening visit samples to estimate the specificity (false positive rate).

Our true positive samples are the Mock samples since they contain, among other strains, the 15 LBP strains.

The relative abundance of each LBP strains in Mock 1 samples should be around 1/100, while in Mock 2 samples, it should be around 1/24.

Unfortunately, we do not have a good biological negative sample (e.g., a pool of non-LBP strains), so we use the screening visit samples as our “true negative samples”, acknowledging that some of these samples might contain native L. crispatus that may be very closely related to one of the LBP strains.

Code
tmp <- 
  qpcr |> 
  dplyr::filter(
    (sample_type == "Positive control") | ((mg_visit_code <= 1000) & (sample_type == "Clinical sample"))
  ) |> 
  select(
    uid, mg_visit_code, sample_type, control_type, ext_lib_plate_nb, target, strain_group_qpcr, LBP, strain_origin, copies_per_swab_adj_median
  ) |> 
  arrange(uid) |> 
  distinct() |> 
  mutate(
    truth = 
      case_when(
        str_detect(control_type, "Mock") ~ "P",
        (mg_visit_code <= 1000) ~ "N",
        TRUE ~ NA_character_
      ),
    predicted = 
      case_when(
        copies_per_swab_adj_median > 0 ~ "PP",
        TRUE ~ "PN"
      )
  )
Code
tmp |> 
  ggplot() +
  aes(
    x = sample_type, y = copies_per_swab_adj_median |> asinh(), 
    col = str_c(sample_type, " ",control_type |> as.character(), " (", truth, ")")
    ) +
  ggh4x::facet_nested(cols = vars(strain_group_qpcr, target)) +
  scale_color_discrete("Control type") + xlab("") +
  ggbeeswarm::geom_quasirandom(size = 0.7, alpha = 0.8) +
  theme(
    #strip.text.x = element_text(angle = 90),
    axis.text.x = element_blank()
  )

Code
tmp <- 
  tmp |> 
  dplyr::filter(target != "16S") |> 
  dplyr::left_join(
    tmp |> 
      dplyr::filter(target == "16S") |> 
      select(uid, copies_per_swab_adj_median) |> 
      dplyr::rename(copies_per_swab_adj_median_16S = copies_per_swab_adj_median),
    by = join_by(uid)
  )
Code
tmp <- 
  tmp |> 
  mutate(
    exp_rel_ab = 
      case_when(
        (control_type == "Mock 1") ~ 1 / 100,
        (control_type == "Mock 2") ~ 1 / 24,
        TRUE ~ 0
      ),
    obs_rel_ab = copies_per_swab_adj_median / copies_per_swab_adj_median_16S
  )
Code
g <- 
  tmp |> 
  ggplot() +
  aes(
    x = str_c(LBP, strain_origin, target, sep = " - "), y = obs_rel_ab, 
    col = str_c(sample_type, " ", control_type, " (", truth, ")")
  ) +
  facet_grid(.  ~ sample_type + control_type) + # , scales = "free"
  geom_hline(aes(yintercept = exp_rel_ab), col = "gray50") +
  geom_boxplot(alpha = 0, outlier.shape = NA) +
  geom_point(alpha = 0.3) +
  scale_color_discrete("Control type") + 
  ylab("Observed relative abundance") + 
  xlab("LBP strain") + 
  labs(
    caption = "The horizontal lines are the expected relative abundance of each LBP strain in the control samples."
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

g

Code
g + ylim(c(0, 0.25)) 

Code
g + ylim(c(0, 0.05)) 

Code
tmp |> 
  group_by(target, strain_group_qpcr, LBP, strain_origin) |>
  summarize(
    `n True Negatives` = sum(truth == "N" & predicted == "PN"),
    `n Negatives` = sum(truth == "N"),
    specificity = `n True Negatives` / `n Negatives`,
    `n True Positives` = sum(truth == "P" & predicted == "PP"),
    `n Positives` = sum(truth == "P"),
    sensitivity = sum(truth == "P" & predicted == "PP") / sum(truth == "P"),
    .groups = "drop"
  ) |> 
  arrange(strain_group_qpcr, target) |> 
  gt(caption = "Specificity and sensitivity of the qPCR assays for the LBP targets") |>
  cols_label(
    specificity ~ "Specificity (%) [likely underestimated]",
    sensitivity ~ "Sensitivity (%)",
    strain_origin ~ "Strain origin",
    strain_group_qpcr ~ "qPCR group",
    target ~ "Target (LBP strain)"
  ) |>
  fmt_number(
    columns = c(specificity, sensitivity),
    decimals = 2,
    scale_by = 100
  ) 
Specificity and sensitivity of the qPCR assays for the LBP targets
Target (LBP strain) qPCR group LBP Strain origin n True Negatives n Negatives Specificity (%) [likely underestimated] n True Positives n Positives Sensitivity (%)
C0022A1 VMRC_1 LC-106 & LC-115 US 290 296 97.97 21 22 95.45
C0059E1 VMRC_1 LC-106 & LC-115 US 274 296 92.57 21 22 95.45
C0175A1 VMRC_1 LC-106 & LC-115 US 281 296 94.93 21 22 95.45
FF00018 VMRC_2 LC-106 & LC-115 SA 286 296 96.62 21 22 95.45
FF00051 VMRC_2 LC-106 & LC-115 SA 278 296 93.92 21 22 95.45
UC101 VMRC_2 LC-106 & LC-115 SA 284 296 95.95 21 22 95.45
185329 VMRC_3 LC-115 SA 296 296 100.00 15 22 68.18
C0028A1 VMRC_3 LC-115 US 294 296 99.32 13 22 59.09
C0112A1 VMRC_3 LC-115 US 294 296 99.32 21 22 95.45
122010 VMRC_4 LC-115 SA 284 296 95.95 21 22 95.45
C0006A1 VMRC_4 LC-115 US 291 296 98.31 21 22 95.45
FF00004 VMRC_4 LC-115 SA 293 296 98.99 21 22 95.45
FF00064 VMRC_5 LC-115 SA 291 296 98.31 21 22 95.45
FF00072 VMRC_5 LC-115 SA 295 296 99.66 21 22 95.45
UC119 VMRC_5 LC-115 SA 296 296 100.00 21 22 95.45
Code
tmp |> 
  dplyr::filter(sample_type == "Clinical sample") |> 
  mutate(
    site = case_when(
      str_detect(uid, "06810") ~ "MGH",
      str_detect(uid, "06820") ~ "CAP",
      TRUE ~ "Other"
    )
  ) |> 
  ggplot() +
  aes(
    x = site, y = copies_per_swab_adj_median |> asinh(), 
    col = site
    ) +
  ggh4x::facet_nested(cols = vars(strain_group_qpcr, target)) +
  scale_color_manual("Study site", values = site_colors) + 
  ggbeeswarm::geom_quasirandom(size = 0.7, alpha = 0.8) +
  theme(
    #strip.text.x = element_text(angle = 90),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

It looks like there might be a mild correlation between strain origin and study site for screening visit samples.

Creating SummarizedExperiment objects

We create two Summarized Experiment objects:

  • one with all values as provided in the original file, where each “sample” is a plate well and each feature is a target (LBP taxa)

  • one with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including “biological” control samples) and each feature is a target (LBP taxa)

Raw SummarizedExperiment object

We create a SummarizedExperiment object with the following assays:

  • dilution_factor
  • cq
  • starting_quantity
  • cq_mean
  • quant_adjusted
  • copies_per_swab_raw (current copies_per_swab column)
  • copies_per_swab (current copies_per_swab_adj column)

The “samples” are defined by the qpcr_uid column (one per well and plate) and the “features” are defined by the target column (i.e., the LBP taxa).

The colData contains the following columns:

  • qpcr_uid (the qPCR specific unique identifier)
  • uid (the VIBRANT cross-assay unique sample identifier)
  • qpcr_sample_id
  • vibr_sample_id
  • qpcr_sample_type
  • sample_type
  • control_type
  • ext_lib_plate_nb
  • ext_lib_plate_id

The rowData contains the following columns:

  • fluor
  • strain_group_qpcr
  • pcr_plate_id (the concatenation of pcr_plate_id for each ext_lib_plate_nb)
  • taxon_label
  • LBP
  • strain_id
  • strain_origin
  • biose_id
Code
qpcr |> dplyr::count(qpcr_uid) |> dplyr::count(n)

qpcr |> dplyr::count(qpcr_uid) |> nrow()


qpcr |> dplyr::count(qpcr_uid) |> dplyr::filter(n != 16) |> View()

qpcr |> 
  select(
    qpcr_uid, 
    uid, mg_pid, mg_visit_code,
    qpcr_sample_type,  sample_type, control_type, 
    qpcr_sample_id, vibr_sample_id, replicate_nb, 
    ext_lib_plate_nb, ext_lib_plate_id
  ) |> 
  distinct() |> 
  group_by(qpcr_uid) |> mutate(n = n()) |> ungroup() |> 
  dplyr::filter(n > 1) |> 
  arrange(qpcr_uid)

qpcr |> 
  dplyr::filter(qpcr_sample_id == "1588") |> 
  select(qpcr_sample_id, qpcr_uid, pcr_plate_id, well_row, well_col, ext_lib_plate_nb, ext_lib_plate_id, target) |> 
  arrange(target, pcr_plate_id,  qpcr_uid, well_row, well_col) |> 
  View()
Code
make_raw_qpcr_SE <- function(qpcr){
  
  qpcr <- 
    qpcr |> 
    arrange(ext_lib_plate_nb, well_col, well_row) |>
    mutate(qpcr_uid = qpcr_uid |> fct_inorder()) |> 
    arrange(strain_group_qpcr, fluor) |> 
    mutate(target = target |> fct_inorder()) |> 
    arrange(qpcr_uid, target)
    
  # ASSAYS
  
  dilution_assay <- 
    qpcr |>
    select(qpcr_uid, target, dilution_factor) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = dilution_factor) |> 
    as.data.frame() |> 
    column_to_rownames("target") 
  
  cq_assay <- 
    qpcr |>
    select(qpcr_uid, target, cq) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = cq) |> 
    as.data.frame() |> 
    column_to_rownames("target") 
  
  starting_quantity_assay <- 
    qpcr |>
    select(qpcr_uid, target, starting_quantity) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = starting_quantity) |> 
    as.data.frame() |> 
    column_to_rownames("target")
  
   cq_mean_assay <- 
    qpcr |>
    select(qpcr_uid, target, cq_mean) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = cq_mean) |> 
    as.data.frame() |> 
    column_to_rownames("target")
   
   quant_adjusted_assay <- 
     qpcr |>
     select(qpcr_uid, target, quant_adjusted) |> 
     arrange() |> 
     pivot_wider(names_from = qpcr_uid, values_from = quant_adjusted) |> 
     as.data.frame() |> 
     column_to_rownames("target")
   
   copies_per_swab_raw_assay <- 
     qpcr |>
     select(qpcr_uid, target, copies_per_swab) |> 
     arrange() |> 
     pivot_wider(names_from = qpcr_uid, values_from = copies_per_swab) |> 
     as.data.frame() |> 
     column_to_rownames("target")
   
  copies_per_swab_assay <- 
    qpcr |>
    select(qpcr_uid, target, copies_per_swab_adj) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = copies_per_swab_adj) |> 
    as.data.frame() |> 
    column_to_rownames("target")
  
    # COLDATA
   coldata <- 
     qpcr |> 
     select(
       qpcr_uid, 
       uid, mg_pid, mg_visit_code,
       qpcr_sample_type,  sample_type, control_type, 
       qpcr_sample_id, vibr_sample_id, replicate_nb, 
       ext_lib_plate_nb #, ext_lib_plate_id
       ) |> 
     distinct() |> 
     arrange(qpcr_uid) |> 
     as.data.frame() |> 
     mutate(rownames = qpcr_uid) |> 
     column_to_rownames("rownames") 
  

    # ROWDATA
   rowdata <-
     qpcr |> 
     select(
       target, fluor, strain_group_qpcr, 
       pcr_plate_id, ext_lib_plate_nb, 
       taxon_label, LBP, strain_id, strain_origin, biose_id
       ) |> 
     distinct() |> 
     group_by(
       target, fluor, strain_group_qpcr, 
       taxon_label, LBP, strain_id, strain_origin, biose_id
       ) |> 
     arrange(target, ext_lib_plate_nb) |> 
     mutate(
       pcr_plate_id = str_c(pcr_plate_id, " (extr. plate ", ext_lib_plate_nb |> str_pad(width = 2, pad = "0"), ")", sep = ""),
     ) |> 
     summarize(
       pcr_plate_ids = pcr_plate_id |> unique() |> str_c(collapse = ", "),
       .groups = "drop"
     ) |> 
     ungroup() |> 
     as.data.frame() |> 
     mutate(rownames = target) |> 
     column_to_rownames("rownames")

  
   
   SE <- SummarizedExperiment(
     assays = list(
       dilution = dilution_assay |> as.matrix(),
       cq = cq_assay |> as.matrix(),
       cq_mean = cq_mean_assay |> as.matrix(),
       starting_quantity = starting_quantity_assay |> as.matrix(),
       quant_adjusted = quant_adjusted_assay |> as.matrix(),
       copies_per_swab_raw = copies_per_swab_raw_assay |>  as.matrix(),
       copies_per_swab = copies_per_swab_assay |> as.matrix()
     ),
     colData = coldata,
     rowData = rowdata,
     metadata = list(
       description = "Raw qPCR data from the VIBRANT study",
       date = today(),
       assay_and_coldata_dictionary = dictionary
     )
   )
   
   SE
  
}
Code
SE_qPCR_raw <- make_raw_qpcr_SE(qpcr)
SE_qPCR_raw
# A SummarizedExperiment-tibble abstraction: 63,312 × 29
# Features=16 | Samples=3957 | Assays=dilution, cq, cq_mean, starting_quantity,
#   quant_adjusted, copies_per_swab_raw, copies_per_swab
   .feature .sample      dilution    cq cq_mean starting_quantity quant_adjusted
   <chr>    <chr>           <dbl> <dbl>   <dbl>             <dbl>          <dbl>
 1 C0175A1  std_02_plat…       10  18.5    18.5           1000000       10000000
 2 C0022A1  std_02_plat…       10  19.3    19.3           1000000       10000000
 3 C0059E1  std_02_plat…       10  18.8    18.8           1000000       10000000
 4 UC101    std_02_plat…       10  17.3    17.2           1000000       10000000
 5 FF00018  std_02_plat…       10  18.2    18.1           1000000       10000000
 6 FF00051  std_02_plat…       10  18.1    18.1           1000000       10000000
 7 185329   std_02_plat…       30  17.1    17.1           1000000       30000000
 8 C0028A1  std_02_plat…       30  17.2    17.2           1000000       30000000
 9 C0112A1  std_02_plat…       30  17.8    17.8           1000000       30000000
10 FF00004  std_02_plat…       10  18.0    18.1           1000000       10000000
# ℹ 310 more rows
# ℹ 22 more variables: copies_per_swab_raw <dbl>, copies_per_swab <dbl>,
#   qpcr_uid <fct>, uid <chr>, mg_pid <chr>, mg_visit_code <chr>,
#   qpcr_sample_type <fct>, sample_type <chr>, control_type <chr>,
#   qpcr_sample_id <chr>, vibr_sample_id <chr>, replicate_nb <int>,
#   ext_lib_plate_nb <dbl>, target <fct>, fluor <chr>, strain_group_qpcr <chr>,
#   taxon_label <chr>, LBP <fct>, strain_id <fct>, strain_origin <fct>, …

Aggregated SummarizedExperiment object

We create a SummarizedExperiment object with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including control samples) and each feature is a target (LBP taxa), with the following assays

  • dilution
  • copies_per_swab_med (the median copies_per_swab across replicates)
  • copies_per_swab_mean (the mean copies_per_swab across replicates)
  • copies_per_swab_cv (the coefficient of variation of copies_per_swab across replicates)

The colData contains the same columns as above (except for those that are replicate-specific)

The rowData contains the same columns as above,

Code
agg_qpcr_SE <- function(SE_qPCR_raw){
  
  filtered_qpcr <- 
    SE_qPCR_raw |> 
    dplyr::filter(
      !(qpcr_sample_type %in% c("Standard", "Water", "Empty")),
      !is.na(uid) 
      ) 
    
  summarized_qpcr <- 
    filtered_qpcr |>
    as_tibble() |> 
    group_by(uid, target) |> 
    summarize(
      n_non_na = sum(!is.na(copies_per_swab)),
      dilution = mean(dilution, na.rm = TRUE),
      copies_per_swab_med = median(copies_per_swab, na.rm = TRUE),
      copies_per_swab_mean = mean(copies_per_swab, na.rm = TRUE),
      copies_per_swab_sd = sd(copies_per_swab, na.rm = TRUE),
      copies_per_swab_range = max(copies_per_swab, na.rm = TRUE) - min(copies_per_swab, na.rm = TRUE),
      .groups = "drop"
    ) |> 
    mutate(
      copies_per_swab_cv = ifelse(copies_per_swab_sd == 0, 0, copies_per_swab_sd / copies_per_swab_mean)
    ) 

  # COLDATA
  coldata <- 
    filtered_qpcr |> 
    colData() |> 
    as.data.frame() |> 
    as_tibble() |> 
    select(-qpcr_uid, -replicate_nb, -starts_with("ext_lib")) |> 
    distinct() |>  
    arrange(uid)
  
  coldata <-  
    coldata |> 
    dplyr::left_join(
      filtered_qpcr |> 
        colData() |> 
        as.data.frame() |> 
        as_tibble() |> 
        select(uid, starts_with("ext_lib")) |> 
        group_by(uid) |> 
        summarize(
          ext_lib_plate_nb = str_c(ext_lib_plate_nb |> unique() |> sort(), collapse = ", "),
          # ext_lib_plate_id = str_c(ext_lib_plate_id |> str_replace_na("unknown") |> unique() |> sort(), collapse = ", "),
          .groups = "drop"
        ),
      by = join_by(uid)
    )
  
   coldata <-  
    coldata |> 
    as.data.frame() |> 
    mutate(rownames = uid) |> 
    column_to_rownames("rownames")
  
  # ROWDATA
   rowdata <-
     filtered_qpcr |> 
     rowData() |> 
     as.data.frame()
    
   SE <- SummarizedExperiment(
     assays = list(
       n_non_na = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = n_non_na) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
        dilution = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = dilution) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_med = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_med) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_mean = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_mean) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_cv = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_cv) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix()
     ),
     colData = coldata,
     rowData = rowdata,
     metadata = list(
       description = "Aggregated qPCR data from the VIBRANT study",
       date = today(),
       assay_and_coldata_dictionary = SE_qPCR_raw@metadata$assay_and_coldata_dictionary
     )
   )
   
   SE
     
}
Code
SE_qPCR_agg <- agg_qpcr_SE(SE_qPCR_raw = SE_qPCR_raw)
SE_qPCR_agg
# A SummarizedExperiment-tibble abstraction: 16,464 × 25
# Features=16 | Samples=1029 | Assays=n_non_na, dilution, copies_per_swab_med,
#   copies_per_swab_mean, copies_per_swab_cv
   .feature .sample   n_non_na dilution copies_per_swab_med copies_per_swab_mean
   <chr>    <chr>        <int>    <dbl>               <dbl>                <dbl>
 1 C0175A1  06810000…        3        5                   0                    0
 2 C0022A1  06810000…        3        5                   0                    0
 3 C0059E1  06810000…        3        5                   0                    0
 4 UC101    06810000…        3        5                   0                    0
 5 FF00018  06810000…        3        5                   0                    0
 6 FF00051  06810000…        3        5                   0                    0
 7 185329   06810000…        3       20                   0                    0
 8 C0028A1  06810000…        3       20                   0                    0
 9 C0112A1  06810000…        3       20                   0                    0
10 FF00004  06810000…        3       20                   0                    0
# ℹ 310 more rows
# ℹ 19 more variables: copies_per_swab_cv <dbl>, uid <chr>, mg_pid <chr>,
#   mg_visit_code <chr>, qpcr_sample_type <fct>, sample_type <chr>,
#   control_type <chr>, qpcr_sample_id <chr>, vibr_sample_id <chr>,
#   ext_lib_plate_nb <chr>, target <fct>, fluor <chr>, strain_group_qpcr <chr>,
#   taxon_label <chr>, LBP <fct>, strain_id <fct>, strain_origin <fct>,
#   biose_id <chr>, pcr_plate_ids <chr>

Save SummarizedExperiment objects

We first check that the SE_qPCR_agg object is formatted as it should for its integration in the MAE.

Code
# We remove `mg_pid` and `mg_visit_code` from the colData to avoid conflict when merging with the metagenomics data.
colData(SE_qPCR_agg) <- colData(SE_qPCR_agg)[, -which(colnames(colData(SE_qPCR_agg)) %in% c("mg_pid", "mg_visit_code"))]
# We check that the `SE_qPCR_agg` object is formatted as it should
SE_qPCR_agg <- check_se(SE_qPCR_agg)

Save the SE objects to disk

Code
saveRDS(
  SE_qPCR_raw, 
  str_c(
    get_01_output_dir(),  
    "03_se_pcr_raw_", today() |> str_remove_all("-"), ".rds"
    )
  )


saveRDS(
  SE_qPCR_agg, 
  str_c(
    get_01_output_dir(),  
    "03_se_pcr_agg_", today() |> str_remove_all("-"), ".rds"
    )
  )